

library(tidyverse) 
library(haven) # reads Stata .dta files; installed with the tidyverse but NOT attached by library(tidyverse), so it must be loaded explicitly
library(dplyr) 
library(forcats) # for categorical variables (R for data science rec) --> see https://rdrr.io/cran/forcats/man/fct_unify.html
library(stringr) # for manipulating string variables (R for data science rec)
library(data.table) # this has some useful features like rbindlist
#library(lubridate) # for dates and times (R for data science rec)
#library(dummies) # to easily create dummies
library(ggplot2) 
library(ggrepel) # to avoid text labels in ggplot from overlapping
library(modelr) # to use "add_predictions()" for adding a column of predicted vals to your dataset
library(broom) # to create tidy data from model output
#library(margins) # R equivalent of Stata's margins command --> Thomas Leeper said this only to be used for marginal effects (not prediction)
#library(prediction) # Thomas Leeper's R package to get predicted probabilities
library(srvyr)  # survey package that also works with dplyr 
library(lmtest) # for likelihood ratio tests
#library(sandwich) # for robust standard errors 
#library(multiwayvcov) # for clustered standard errors
library(miceadds) # package to cluster SEs more easily than in multiwayvcov; it uses multiwayvcov, so the results between the two packages are exactly the same. 
#library(glmmML)  # Allows for fast computation of logits and poissons with large number of fixed effects
library(speedglm)
#library(data.table) # this has some useful features like rbindlist
#library(lme4) # for multi-level modeling
#library(lmerTest) # for p-values with the lmer command
#library(sjPlot) # for plotting lmer models
#library(texreg) # for tables
library(tableone) # Creates a table 1 (summary characteristics)
#library(mice) # md.pattern() function to see patterns of missing data 
library(reshape) # to use the rescaler function
library(haven)
#library(car) # for easy attaching of new variables
#library(arm)
#library(mosaic)
#library(mosaicData)
#library(mediation)  # for mediation analysis
#library(lattice)
#library(pander)

# Install haven only when it is missing. An unconditional install.packages()
# call re-downloads the package on every run and attempts to reinstall a
# package that library(haven) has already loaded into the session.
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven")
}

# NOTE(review): setwd() in a script is fragile, but the relative output path
# passed to write_csv() later in this file resolves against this directory,
# so the call is kept unchanged.
setwd("~/Desktop/Public Health/original data cleaning code")

#### Shared recode helper
# Collapse several tobacco items into one indicator, element by element:
#   1  when at least one item equals 1 (even if other items are missing),
#   0  when every item equals 0,
#   NA otherwise (an item is missing and none equals 1, or an item holds a
#      code other than 0/1).
# `...` takes the item vectors (columns of equal length). The items are
# combined with `|` / `&` in the order given, which reproduces a hand-written
# chain of `a == 1 | b == 1 | ...` and `a == 0 & b == 0 & ...`.
code_tobacco_use <- function(...) {
  items <- list(...)
  any_yes <- Reduce(`|`, lapply(items, function(x) x == 1))
  all_no <- Reduce(`&`, lapply(items, function(x) x == 0))
  ifelse(any_yes, 1, ifelse(all_no, 0, NA))
}

#### WOMEN
india.women <- read_dta("~/iCloud Drive (Archive)/Documents/Public Health Files/Public Health/Multiple Morbidities/Datasets/IAIR73DT/IAIR73FL.DTA")

# Add the join key (p_id, copied from caseid) and the two tobacco indicators.
india.women <- india.women %>%
  mutate(
    p_id = caseid,
    tobacco_smoked = code_tobacco_use(s710c, v463a, v463b, s707, v463e, v463x),
    tobacco_smokeless = code_tobacco_use(v463c, v463d, v463f, v463g, s710e)
  )

# Keep only the key and the derived indicators for the later append/merge.
india.w <- dplyr::select(india.women, p_id, tobacco_smoked, tobacco_smokeless)

#india.women$tobacco_smoked <- as.factor(india.women$tobacco_smoked)
#summary(india.women$tobacco_smoked)

##### MEN
india.men <- read_dta("~/iCloud Drive (Archive)/Documents/Public Health Files/Public Health/Multiple Morbidities/Datasets/IAMR73DT/IAMR73FL.DTA")

# Same recode as for women, using the men's-file variable names; the join key
# is copied from mcaseid.
india.men <- india.men %>%
  mutate(
    p_id = mcaseid,
    tobacco_smoked = code_tobacco_use(sm609c, mv463a, mv463b, sm606, mv463e, mv463x),
    tobacco_smokeless = code_tobacco_use(mv463c, mv463d, mv463f, mv463g, sm609e)
  )

# Keep only the key and the derived indicators for the later append/merge.
india.m <- dplyr::select(india.men, p_id, tobacco_smoked, tobacco_smokeless)



#india.men$tobacco_smoked <- as.factor(india.men$tobacco_smoked)
#summary(india.men$tobacco_smoked)

# DHS extract (keyed by p_id) that the tobacco indicators are attached to.
DHS.India.updated <- read.csv("~/Documents/Public Health Files/Public Health/public health/DHS.India.updated.csv")
dhs <- DHS.India.updated

# Append and merge
#india.hh= 2869043 obs of 13 variables
#india.women=699686 obs of 64 variables
#india.men=112122 obs of 55 variables
#without filter 811808 observations,with filter 757655
india.ind <- bind_rows(india.w, india.m)

# Normalise the join key on BOTH tables before merging. Previously this ran
# after the merge, so differences in padding whitespace or column type
# between the two sources were still present when the keys were compared.
dhs$p_id <- trimws(as.character(dhs$p_id))
india.ind$p_id <- trimws(as.character(india.ind$p_id))
stopifnot(is.character(dhs$p_id), is.character(india.ind$p_id))

# A duplicated key in the lookup table would multiply rows of dhs in the
# merge; warn rather than stop so the script still produces its output.
if (anyDuplicated(india.ind$p_id) > 0) {
  warning("india.ind contains duplicated p_id values; the merge will duplicate dhs rows",
          call. = FALSE)
}

# Single left merge: every dhs row is kept, tobacco columns are NA where no
# individual record matches. (The earlier left_join() result was overwritten
# by this merge and has been dropped.)
india <- merge(dhs, india.ind, by = "p_id", all.x = TRUE)

# Written relative to the working directory set near the top of the script.
write_csv(india, "DHS with smoking and smokeless.csv")

# Convert the indicators to factors for tabulation; this happens after the
# CSV has been written, so the file holds the numeric 0/1/NA coding.
india$tobacco_smoked <- as.factor(india$tobacco_smoked)
summary(india$tobacco_smoked)

india$tobacco_smokeless <- as.factor(india$tobacco_smokeless)
summary(india$tobacco_smokeless)

              
                         
              